Jornada de introducción a la Genómica y la Bioinformática

Machine learning y bioinformática: análisis de componentes principales

Profesor: Carlos Pérez-Glez (departamento de Matemáticas, Estadística e Investigación Operativa)

Junio, 2019

PCA aplicado a DNA microarrays

PCA en microarrays: expresión génica en tejidos tumorales

datos.genes <- url("https://github.com/cpgonzal/cursoBioPCA/blob/master/datos_genes.RData?raw=true")
load(datos.genes)
head(datos_trans)
##             gene_1     gene_2     gene_3     gene_4      gene_5
## tumor_a  0.8726627  1.3464477  0.1580743 -0.5027416 -0.75876070
## tumor_b  0.8806472  0.7498311 -0.3363201 -0.5912593 -0.86344136
## tumor_c  1.3893236  1.2426073 -0.1222758 -0.5823842 -1.28813620
## tumor_d -0.8234644 -1.3032958 -0.6156777 -0.3855728 -0.34297901
## tumor_e -1.1174522 -1.1774594 -0.9522566 -0.3298219  0.34540177
## tumor_f -0.9308277 -0.7110086 -1.2965523 -0.4010737  0.05903187
##              gene_6     gene_7     gene_8    gene_9
## tumor_a -1.01691819  0.1126525 0.32300317 1.0846077
## tumor_b -1.04780750 -0.2406702 0.70238943 1.1404019
## tumor_c -0.85190531  0.3247319 0.20602104 0.7595605
## tumor_d  0.07098378  1.0210364 0.69743729 0.9916284
## tumor_e -0.52650153  1.1487223 0.00599308 1.2312660
## tumor_f  0.20957245  0.9537663 0.72007731 0.9110078

PCA en microarrays: matriz de var-covar de los datos

var(datos_trans)   # matriz S de var-cov.
##             gene_1      gene_2     gene_3     gene_4      gene_5
## gene_1  1.00117018  1.04167864  0.6793231  0.2326529 -0.04149816
## gene_2  1.04167864  1.16089133  0.6894438  0.2275600 -0.02704006
## gene_3  0.67932307  0.68944378  0.7561245  0.4639694  0.41382428
## gene_4  0.23265295  0.22756001  0.4639694  0.4364229  0.54997137
## gene_5 -0.04149816 -0.02704006  0.4138243  0.5499714  0.86032584
## gene_6  0.10243798  0.07268947  0.5220257  0.6114519  0.83813275
## gene_7 -0.70688718 -0.72463626 -0.7147269 -0.4538187 -0.40278529
## gene_8 -0.11621339 -0.16529264 -0.1819963 -0.1237691 -0.21090807
## gene_9 -0.52214230 -0.50594909 -0.7836571 -0.6686265 -0.76280996
##             gene_6     gene_7     gene_8     gene_9
## gene_1  0.10243798 -0.7068872 -0.1162134 -0.5221423
## gene_2  0.07268947 -0.7246363 -0.1652926 -0.5059491
## gene_3  0.52202568 -0.7147269 -0.1819963 -0.7836571
## gene_4  0.61145185 -0.4538187 -0.1237691 -0.6686265
## gene_5  0.83813275 -0.4027853 -0.2109081 -0.7628100
## gene_6  0.98873247 -0.5058350 -0.1472066 -0.9285206
## gene_7 -0.50583504  0.7732758  0.1111573  0.7792427
## gene_8 -0.14720664  0.1111573  0.3233331  0.1753145
## gene_9 -0.92852057  0.7792427  0.1753145  1.0818186

PCA en microarrays: tests estadísticos

library(psych)
cortest.bartlett(cor(datos_trans), n=nrow(datos_trans))
## $chisq
## [1] 219.9707
## 
## $p.value
## [1] 2.868623e-28
## 
## $df
## [1] 36
KMO(cor(datos_trans))
## Error in solve.default(r) : 
##   sistema es computacionalmente singular: número de condición recíproco = 3.88383e-18
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(datos_trans))
## Overall MSA =  0.5
## MSA for each item = 
## gene_1 gene_2 gene_3 gene_4 gene_5 gene_6 gene_7 gene_8 gene_9 
##    0.5    0.5    0.5    0.5    0.5    0.5    0.5    0.5    0.5
library(tidyr)
df_box<-gather(data = datos_trans, key = gene, value = expresion)

library(ggplot2)
ggplot(df_box, 
       aes(x = gene, y = expresion, fill="orange")) + geom_boxplot() + 
  scale_fill_discrete(guide=FALSE) + xlab("") +
  ylab("Gene expression")

PCA en microarrays: métodos de PCA en R

res.pca <- prcomp(datos_trans)
#formatC(res.pca$sdev^2,digits=5)   
res.pca$sdev^2      # autovalores de S = varianza de las PCs
## [1] 4.749891e+00 2.094519e+00 3.258526e-01 8.693594e-02 7.534952e-02
## [6] 2.933207e-02 1.588891e-02 4.325098e-03 2.390192e-35
res.pca$rotation    # autovectores de S = coeficientes de las PCs
##                PC1         PC2         PC3         PC4         PC5
## gene_1 -0.32440776 -0.48357572 -0.05264719  0.23092826  0.06430301
## gene_2 -0.33570449 -0.52883524  0.15730951 -0.05116911 -0.59181524
## gene_3 -0.38149923 -0.09411618  0.02869418 -0.05355245  0.70302009
## gene_4 -0.27457943  0.18802273 -0.03011017 -0.09547975  0.03966169
## gene_5 -0.28190849  0.45435076  0.24059899 -0.54247146 -0.25039845
## gene_6 -0.34351016  0.43110479 -0.13119860  0.52639475 -0.24716748
## gene_7  0.38298299  0.11865112  0.19385118  0.49236650 -0.08297692
## gene_8  0.09693233 -0.02348610 -0.91146826 -0.18724291 -0.13684989
## gene_9  0.45561288 -0.18804724  0.16522229 -0.28847569 -0.02013285
##                PC6         PC7         PC8         PC9
## gene_1  0.27789378  0.36325350  0.54470996 -0.30788418
## gene_2 -0.39129871 -0.17786437 -0.18452644  0.10595059
## gene_3 -0.49807487 -0.27810776 -0.01271261 -0.14861997
## gene_4 -0.17869443  0.31243801  0.36286401  0.78513568
## gene_5 -0.13119036  0.19638979  0.19902822 -0.45148462
## gene_6  0.06604420 -0.51825032  0.25577626 -0.04885115
## gene_7 -0.62023896  0.34394225  0.14524390 -0.16813475
## gene_8 -0.28126099  0.04108705  0.08611121 -0.13145080
## gene_9 -0.06536996 -0.48405885  0.63550334  0.06068341
# NOTA: res.pca$rotation%*%diag(res.pca$sdev^2)%*%t(res.pca$rotation) es igual a S

PCA en microarrays: cálculo de las CPs

datos.center<-scale(datos_trans,scale=FALSE)
as.matrix(datos.center)%*%res.pca$rotation[,1]  # coordenadas de los individuos/observaciones en la 1ra PC 
##               [,1]
## tumor_a  0.5183955
## tumor_b  0.8960083
## tumor_c  0.5288076
## tumor_d  2.0720868
## tumor_e  2.3405535
## tumor_f  1.9508547
## tumor_g -2.6457410
## tumor_h -2.6769372
## tumor_i -2.9840281
res.pca$x
##                PC1        PC2         PC3         PC4          PC5
## tumor_a  0.5183955 -1.8589659  0.22540007 -0.22147023 -0.008519502
## tumor_b  0.8960083 -1.6396306 -0.30660024 -0.37474097  0.004122781
## tumor_c  0.5288076 -2.1228344  0.12123349  0.51989858 -0.017368423
## tumor_d  2.0720868  1.2317525 -0.35111394  0.30376631  0.413507629
## tumor_e  2.3405535  1.3910448  0.61141038 -0.32259457  0.140234794
## tumor_f  1.9508547  1.2806624 -0.23981513  0.12737809 -0.554014877
## tumor_g -2.6457410  0.6969827 -0.08106831 -0.03391124 -0.251158694
## tumor_h -2.6769372  0.4544845 -0.96078343 -0.10403275  0.183604752
## tumor_i -2.9840281  0.5665040  0.98133711  0.10570679  0.089591539
##                 PC6         PC7          PC8           PC9
## tumor_a -0.31553450 -0.13015582 -0.031060726  3.330669e-16
## tumor_b  0.30268638 -0.04879218  0.045618874  1.457168e-16
## tumor_c  0.02550008  0.16507342 -0.004797654  1.353084e-16
## tumor_d -0.03093510 -0.13251737  0.052445766 -2.498002e-16
## tumor_e -0.03434690  0.19367901 -0.010923987  3.469447e-17
## tumor_f  0.04640348 -0.06840532 -0.055894212 -4.857226e-17
## tumor_g -0.12184480  0.05163088  0.129225192 -1.665335e-16
## tumor_h -0.02396421  0.07870617 -0.085485097 -2.359224e-16
## tumor_i  0.15203557 -0.10921880 -0.039128157 -1.387779e-17

PCA en microarrays: proporción de varianza explicada por las CPs

PCA en microarrays: proporción de varianza explicada por las CPs

library(ggplot2)
prop_varianza <- res.pca$sdev^2 / sum(res.pca$sdev^2)
formatC(prop_varianza,digits=5)
## [1] "0.64343"    "0.28373"    "0.044141"   "0.011777"   "0.010207"  
## [6] "0.0039734"  "0.0021524"  "0.00058589" "3.2378e-36"
ggplot(data = data.frame(prop_varianza, pc = 1:length(res.pca$sdev) ),
       aes(x = pc, y = prop_varianza)) +
  geom_col(width = 0.3) +
  geom_text(aes(label=formatC(prop_varianza,digits=2)), vjust=-0.5) +
  scale_y_continuous(limits = c(0,1)) +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. de varianza explicada")

ggplot(data = data.frame(prop_varianza, pc = 1:length(res.pca$sdev) ),
       aes(x = pc, y = prop_varianza, group = 1)) +
  geom_point() +
  geom_line() +
  geom_text(aes(label=formatC(prop_varianza,digits=2)), vjust=-0.5) +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. de varianza explicada")

PCA en microarrays: proporción de varianza acumulada explicada por las CPs

prop_varianza_acum <- cumsum(prop_varianza)
ggplot(data = data.frame(prop_varianza_acum, pc = 1:length(res.pca$sdev) ),
       aes(x = pc, y = prop_varianza_acum, group = 1)) +
  geom_point() +
  geom_line() +
  geom_text(aes(label=formatC(prop_varianza_acum,digits=2)), vjust=-0.5) +
  theme_bw() +
  labs(x = "Componente principal",
       y = "Prop. de varianza explicada acumulada")

PCA en microarrays: proporción de varianza acumulada explicada por las CPs

PCA en microarrays: representación de las coordenadas de las observaciones en las CPS

plot_vars<-function(cols) {
  ggplot(data = datos_trans,
         aes_string(x = colnames(datos_trans)[cols[1]], y = colnames(datos_trans)[cols[2]])) +
    geom_point() +
    geom_text(aes(label=rownames(datos_trans)), vjust=-0.5) +
    scale_x_continuous(limits=c(-2,2))+
    scale_y_continuous(limits=c(-2,2))+
    theme_bw() +
    labs(x = colnames(datos_trans)[cols[1]],
         y = colnames(datos_trans)[cols[2]], 
         title=paste0("Plot ",colnames(datos_trans)[cols[1]]," vs ",colnames(datos_trans)[cols[2]])
    )
}

PCA en microarrays: representación de las coordenadas de las observaciones en las CPS

require(gridExtra)
grid.arrange(plot_vars(c(1,2)), 
             plot_vars(c(1,3)), 
             plot_vars(c(2,3)),
             plot_vars(c(2,4)),
             ncol=2)

PCA en microarrays: representación de las coordenadas de las observaciones en las CPS

ggplot(data = data.frame(res.pca$x[,c(1,2)]),
       aes(x = PC1, y = PC2, group = 1)) +
  geom_point() +
  geom_text(aes(label=rownames(datos_trans)), vjust=-0.5) +
  scale_x_continuous(limits=c(-3.5,3.5))+
  scale_y_continuous(limits=c(-3,3))+
  theme_bw() +
  labs(x = "1ra componente principal",
       y = "2nda componente principal",
       title="Plot PC1 vs PC2")

PCA en microarrays: representación de las coordenadas de las observaciones en las CPS

res.pca$rotation[,1]  # 1er autovector
##      gene_1      gene_2      gene_3      gene_4      gene_5      gene_6 
## -0.32440776 -0.33570449 -0.38149923 -0.27457943 -0.28190849 -0.34351016 
##      gene_7      gene_8      gene_9 
##  0.38298299  0.09693233  0.45561288
res.pca$rotation[,2]  # 2ndo autovector
##      gene_1      gene_2      gene_3      gene_4      gene_5      gene_6 
## -0.48357572 -0.52883524 -0.09411618  0.18802273  0.45435076  0.43110479 
##      gene_7      gene_8      gene_9 
##  0.11865112 -0.02348610 -0.18804724
ggplot(data = data.frame(res.pca$x[,c(1,2)]),
       aes(x = PC1, y = PC2, group = 1)) +
  geom_point() +
  geom_text(aes(label=rownames(datos_trans)), vjust=-0.5) +
  scale_x_continuous(limits=c(-3.5,3.5))+
  scale_y_continuous(limits=c(-3,3))+
  theme_bw() +
  labs(x = "1ra componente principal",
       y = "2nda componente principal",
       title="Plot PC1 vs PC2")

PCA en microarrays: interpretación de las componentes principales

Otros comandos relacionados: prcomp() vs. princomp()

res.pca2 <- princomp(datos_trans)
res.pca2$sdev^2      # varianza de las PC, aprox. similar a res.pca$sdev^2 
res.pca2$loadings    # autovectores de S , aprox. similar a res.pca$rotation 
res.pca2$scores      # puntuaciones factoriales, aprox. similar a res.pca$x

Otros comandos relacionados: prcomp() vs. princomp()

The calculation is done by a singular value decomposition of the (centered and possibly scaled) data matrix, not by using eigen on the covariance matrix. This is generally the preferred method for numerical accuracy.

mientras que en princomp se dice

The calculation is done using eigen on the correlation or covariance matrix, as determined by cor. This is done for compatibility with the S-PLUS result. A preferred method of calculation is to use svd on x, as is done in prcomp.

Mejorando el análisis de PCA: el paquete FactoMineR y el método PCA()

library("FactoMineR")
PCA(X, scale.unit = TRUE, ncp = 5, graph = TRUE)  

Mejorando el análisis de PCA: el paquete FactoMineR y el método PCA()

library("FactoMineR")
res.pca3 <- PCA(datos_trans, scale.unit = TRUE, ncp = 9, graph = FALSE)
res.pca3$eig         # autovalores 
##         eigenvalue percentage of variance
## comp 1 5.684355812            63.15950902
## comp 2 2.175108748            24.16787497
## comp 3 0.897354696             9.97060773
## comp 4 0.098520356             1.09467062
## comp 5 0.084511221             0.93901357
## comp 6 0.037312857             0.41458730
## comp 7 0.018166298             0.20184776
## comp 8 0.004670012             0.05188903
##        cumulative percentage of variance
## comp 1                          63.15951
## comp 2                          87.32738
## comp 3                          97.29799
## comp 4                          98.39266
## comp 5                          99.33168
## comp 6                          99.74626
## comp 7                          99.94811
## comp 8                         100.00000
res.pca3$var$coord   # loadings = autovectores ponderados por autovalor 
##             Dim.1       Dim.2       Dim.3        Dim.4       Dim.5
## gene_1  0.6466620  0.75514735  0.02530764  0.070968527  0.01518013
## gene_2  0.6210476  0.75652951 -0.06831293 -0.027695398  0.16560581
## gene_3  0.9440757  0.23545792  0.01108830  0.007017624 -0.21451984
## gene_4  0.9343393 -0.33723391  0.08328456 -0.024031097 -0.01447496
## gene_5  0.7228027 -0.66021466 -0.07557126 -0.173456869  0.06507582
## gene_6  0.7934604 -0.56153744  0.11699316  0.172343407  0.07721584
## gene_7 -0.9261780 -0.27616575 -0.16156359  0.145210419  0.01221888
## gene_8 -0.4147898  0.05816376  0.90689957 -0.027475582  0.01195192
## gene_9 -0.9666105  0.18027465 -0.13046167 -0.102271890 -0.01174382
##              Dim.6         Dim.7        Dim.8
## gene_1 -0.03333382 -0.0590339361  0.032819955
## gene_2  0.08715362  0.0369032081 -0.012102062
## gene_3  0.07195041  0.0436788595 -0.002248936
## gene_4  0.04800057 -0.0518184466  0.023891398
## gene_5  0.03802209 -0.0126292082  0.005875990
## gene_6 -0.02761028  0.0679341196  0.019158673
## gene_7  0.13232499 -0.0324355594  0.005158757
## gene_8  0.03459674  0.0008030624  0.002046340
## gene_9  0.01449830  0.0538442196  0.049377769
res.pca3$ind$coord   # puntuaciones factoriales 
##              Dim.1      Dim.2      Dim.3       Dim.4       Dim.5
## tumor_a -0.8077113  1.9343199 -0.2877048 -0.24634795 -0.07071988
## tumor_b -1.2807803  1.7543961  0.4831515 -0.42479725  0.02056229
## tumor_c -0.9232763  2.2439025 -0.4177970  0.56527325  0.08476949
## tumor_d -2.2531967 -1.3619120  0.4580948  0.39285885 -0.47020407
## tumor_e -2.3409193 -1.7035043 -0.9923015 -0.38499467 -0.16690789
## tumor_f -2.2247307 -1.5032677  0.4482729  0.11169530  0.61391373
## tumor_g  3.1202644 -0.6386387  0.3225145 -0.04507721  0.26280481
## tumor_h  2.9965035 -0.2323599  1.7372830 -0.04930481 -0.22510520
## tumor_i  3.7138469 -0.4929359 -1.7515133  0.08069450 -0.04911329
##                Dim.6       Dim.7       Dim.8
## tumor_a  0.345270482  0.19016035 -0.03147672
## tumor_b -0.360896615  0.00896776  0.05735906
## tumor_c  0.012446380 -0.18920811 -0.01322819
## tumor_d -0.029902956  0.12824184  0.06512958
## tumor_e  0.087974038 -0.21091932 -0.02308535
## tumor_f -0.047562386  0.09652468 -0.05744554
## tumor_g  0.191667353 -0.06068302  0.13596701
## tumor_h -0.002619894 -0.07330765 -0.09931086
## tumor_i -0.196376401  0.11022348 -0.03390899

Análisis del porcentaje de varianza explicada

library("factoextra")
fviz_eig(res.pca3, addlabels = TRUE, ylim = c(0, 80))

PCA en microarrays: análisis de las variables en PCA

Análisis de la correlación entre las CP y las variables originales (loadings)

El concepto de loadings en PCA

Gráfico de correlación de variables y componentes principales

res.pca3$var$cor
res.pca3$var$cor[,1:4]
##             Dim.1       Dim.2       Dim.3        Dim.4
## gene_1  0.6466620  0.75514735  0.02530764  0.070968527
## gene_2  0.6210476  0.75652951 -0.06831293 -0.027695398
## gene_3  0.9440757  0.23545792  0.01108830  0.007017624
## gene_4  0.9343393 -0.33723391  0.08328456 -0.024031097
## gene_5  0.7228027 -0.66021466 -0.07557126 -0.173456869
## gene_6  0.7934604 -0.56153744  0.11699316  0.172343407
## gene_7 -0.9261780 -0.27616575 -0.16156359  0.145210419
## gene_8 -0.4147898  0.05816376  0.90689957 -0.027475582
## gene_9 -0.9666105  0.18027465 -0.13046167 -0.102271890

se aprecia que la 1ra CP tiene mucha correlación con todas las variables (medida global de la expresión de todos los genes) mientras que la 2nda CP tiene mayor correlación en los genes 5-6 (signo -) y los genes 1-2 (signo +), con lo cual, esa 2nda CP es una medida que compara la expresión de ambos cojuntos de genes en los tumores.

fviz_pca_var(res.pca3, col.var = "black")

Interpretación de las componentes principales

Calidad de la representación en componentes principales

# correlaciones al cuadrado=coseno cuadrado
# del ángulo de X_i y su proyección en Y_j
# la suma de cada fila (variable) es = 1
res.pca3$var$cos2   
res.pca3$var$cos2[,1:4]
##            Dim.1       Dim.2        Dim.3        Dim.4
## gene_1 0.4181717 0.570247520 0.0006404767 5.036532e-03
## gene_2 0.3857001 0.572336896 0.0046666560 7.670351e-04
## gene_3 0.8912788 0.055440432 0.0001229504 4.924704e-05
## gene_4 0.8729900 0.113726708 0.0069363185 5.774936e-04
## gene_5 0.5224437 0.435883397 0.0057110156 3.008729e-02
## gene_6 0.6295793 0.315324302 0.0136874003 2.970225e-02
## gene_7 0.8578057 0.076267519 0.0261027929 2.108607e-02
## gene_8 0.1720506 0.003383023 0.8224668380 7.549076e-04
## gene_9 0.9343358 0.032498949 0.0170202474 1.045954e-02

Calidad de la representación en componentes principales

library("corrplot")
corrplot(res.pca3$var$cos2[,1:3], is.corr=FALSE,cl.align.text="l")

fviz_cos2(res.pca3, choice = "var", axes = 1:2) + 
  ylab("Comunalidad")+ggtitle("Comunalidad de 1ra y 2nda CP")

Calidad de la representación en componentes principales

fviz_pca_var(res.pca3, col.var = "cos2", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE 
             )

Contribución de variables a las componentes principales

# contribución de las X_i a cada Y_j
# la suma de cada columna (componente) es = 100
res.pca3$var$contrib   
res.pca3$var$contrib[,1:4]
##            Dim.1      Dim.2       Dim.3       Dim.4
## gene_1  7.356537 26.2169660  0.07137386  5.11217380
## gene_2  6.785291 26.3130244  0.52004586  0.77855491
## gene_3 15.679505  2.5488579  0.01370143  0.04998667
## gene_4 15.357764  5.2285528  0.77297400  0.58616680
## gene_5  9.190905 20.0396140  0.63642789 30.53915610
## gene_6 11.075650 14.4969442  1.52530547 30.14833805
## gene_7 15.090641  3.5063773  2.90886012 21.40275044
## gene_8  3.026739  0.1555335 91.65459789  0.76624533
## gene_9 16.436969  1.4941299  1.89671347 10.61662790

Contribución de variables a las componentes principales

library("corrplot")
corrplot(res.pca3$var$contrib[,1:3], is.corr=FALSE,cl.align.text="l")

Contribución de variables a las componentes principales

# Contributions of variables to PC1
chart01<-fviz_contrib(res.pca3, choice = "var", axes = 1, top = 10) +ggtitle("Contrib. a CP 1")
# Contributions of variables to PC2
chart02<-fviz_contrib(res.pca3, choice = "var", axes = 2, top = 10) +ggtitle("Contrib. a CP 2")
# Contributions of variables to PC1 and PC2
chart03<-fviz_contrib(res.pca3, choice = "var", axes = 1:2, top = 10) +ggtitle("Contrib. a CP 1-2")

require(gridExtra)
grid.arrange(chart01,
             chart02,
             chart03,
             ncol=3)

Contribución de variables a las componentes principales

fviz_pca_var(res.pca3, col.var = "contrib", 
             gradient.cols = c("blue", "yellow", "red")
             )

Coffee-break